date: 2018-10-27

R setup

knitr::opts_chunk$set(echo = TRUE, fig.width = 9, fig.height = 9)

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(knitr)
library(readxl)
library(moments)
library(epiR)
## Warning: package 'epiR' was built under R version 3.4.4
## Loading required package: survival
## Package epiR 0.9-97 is loaded
## Type help(epi.about) for summary information
## 
library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
## 
##     kurtosis, moment, skewness
library(lsmeans)
## Warning: package 'lsmeans' was built under R version 3.4.4
## The 'lsmeans' package is being deprecated.
## Users are encouraged to switch to 'emmeans'.
## See help('transition') for more information, including how
## to convert 'lsmeans' objects and scripts to work with 'emmeans'.
library(lmSupport)
## Warning: package 'lmSupport' was built under R version 3.4.4
library(ppcor)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(tableone)
## Warning: package 'tableone' was built under R version 3.4.4
library(sjPlot)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.12
## Current Matrix version is 1.2.13
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(leaps)

Load data

rm(list = ls())

## define directory
analysisDir <- '/Users/matthewmoll/Documents/Fellowship/MPH/Fall2018/BST213_regression/homework/'
  
## Read in data
cost <- read_excel("/Users/matthewmoll/Documents/Fellowship/MPH/Fall2018/BST213_regression/homework/cost.xls")

Data structure

head(cost)
## # A tibble: 6 x 14
##   outcosts2 ptage   ssi panic anxiety married female educat  iadl social
##       <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>  <dbl>  <dbl> <dbl>  <dbl>
## 1       83.  38.9  1.00    0.      0.      1.     1.     5. 100.    100.
## 2     2162.  53.3  1.00    0.      0.      0.     1.     2.  91.7   100.
## 3     2203.  42.7  1.08    0.      0.      0.     1.     1.  75.0   100.
## 4        0.  70.2  1.15    0.      0.      0.     1.     1.  46.7   100.
## 5     4050.  41.1  1.00    0.      0.      1.     1.     3. 100.    100.
## 6      815.  38.4  1.15    0.      0.      0.     1.     2.  83.3   100.
## # ... with 4 more variables: racecat <dbl>, whiteleycat <dbl>,
## #   charlson <dbl>, depcat <dbl>
str(cost)
## Classes 'tbl_df', 'tbl' and 'data.frame':    461 obs. of  14 variables:
##  $ outcosts2  : num  83 2162 2203 0 4050 ...
##  $ ptage      : num  38.9 53.3 42.7 70.2 41.1 ...
##  $ ssi        : num  1 1 1.08 1.15 1 ...
##  $ panic      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anxiety    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ married    : num  1 0 0 0 1 0 1 0 0 0 ...
##  $ female     : num  1 1 1 1 1 1 0 1 0 0 ...
##  $ educat     : num  5 2 1 1 3 2 5 3 2 2 ...
##  $ iadl       : num  100 91.7 75 46.7 100 ...
##  $ social     : num  100 100 100 100 100 ...
##  $ racecat    : num  4 2 2 1 1 2 4 2 1 1 ...
##  $ whiteleycat: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ charlson   : num  0 1 1 2 0 3 0 2 0 1 ...
##  $ depcat     : num  3 3 3 3 3 3 3 3 3 3 ...
sapply(cost, summary)
## $outcosts2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     307    1016    1836    2463   11909 
## 
## $ptage
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.96   32.24   44.62   45.16   55.21   89.68       1 
## 
## $ssi
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.231   1.538   1.738   2.077   5.000       6 
## 
## $panic
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06291 0.00000 1.00000 
## 
## $anxiety
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06508 0.00000 1.00000 
## 
## $married
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.4096  1.0000  1.0000       2 
## 
## $female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.744   1.000   1.000 
## 
## $educat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   3.000   3.359   5.000   5.000       1 
## 
## $iadl
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   72.22   94.44   81.82  100.00  100.00 
## 
## $social
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00  100.00  100.00   89.86  100.00  100.00       2 
## 
## $racecat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.297   4.000   4.000 
## 
## $whiteleycat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.456   2.000   3.000 
## 
## $charlson
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5662  1.0000  7.0000 
## 
## $depcat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    3.00    2.72    3.00    3.00

Define variables

The goal for this assignment is to build a linear regression model using the priniciples discussed in class, and then write it into a methods and results section of a manuscript.


removevars <- c()

## Define response variables
responseVars <- c('outcosts2')
responseVars
## [1] "outcosts2"
## Define explanatory variables
explanVars <- names(cost)[!names(cost) %in% c("outcosts2")]
explanVars
##  [1] "ptage"       "ssi"         "panic"       "anxiety"     "married"    
##  [6] "female"      "educat"      "iadl"        "social"      "racecat"    
## [11] "whiteleycat" "charlson"    "depcat"
## List of quantitative variables
quantVars <- names(cost[!names(cost) %in% removevars &
                                  sapply(cost, function(x)
                                      length(levels(as.factor(x)))>7)])
quantVars
## [1] "outcosts2" "ptage"     "ssi"       "iadl"      "social"    "charlson"
## List of binary variables
binVars <- names(cost[!names(cost) %in% removevars &
                                sapply(cost, function(x)
                                    length(levels(as.factor(x)))==2)])
binVars
## [1] "panic"   "anxiety" "married" "female"
## List of categorical variables

catVars <- names(cost[!names(cost) %in% c(removevars,binVars) &
                                sapply(cost, function(x)
                                    length(levels(as.factor(x)))<=7 & length(levels(as.factor(x)))>1)])
catVars
## [1] "educat"      "racecat"     "whiteleycat" "depcat"
## Get Intersection between quantVars and explanVars and responseVars
explanVars.quant <- intersect(explanVars,quantVars)
responseVars.quant <- intersect(responseVars,quantVars)

## Get intersection between categorical and binary variables with explanatory and response vars
explanVars.cat <- intersect(explanVars,catVars)
explanVars.cat <- explanVars.cat[!explanVars.cat %in% c(binVars)]

responseVars.cat <- intersect(responseVars, catVars)
responseVars.cat <- responseVars.cat[!responseVars.cat %in% c(binVars)]


## Get binary explanatory and response variables
explanVars.bin <- intersect(explanVars, binVars)
responseVars.bin <- intersect(responseVars, binVars)

## View data ranges
str(cost)
## Classes 'tbl_df', 'tbl' and 'data.frame':    461 obs. of  14 variables:
##  $ outcosts2  : num  83 2162 2203 0 4050 ...
##  $ ptage      : num  38.9 53.3 42.7 70.2 41.1 ...
##  $ ssi        : num  1 1 1.08 1.15 1 ...
##  $ panic      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anxiety    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ married    : num  1 0 0 0 1 0 1 0 0 0 ...
##  $ female     : num  1 1 1 1 1 1 0 1 0 0 ...
##  $ educat     : num  5 2 1 1 3 2 5 3 2 2 ...
##  $ iadl       : num  100 91.7 75 46.7 100 ...
##  $ social     : num  100 100 100 100 100 ...
##  $ racecat    : num  4 2 2 1 1 2 4 2 1 1 ...
##  $ whiteleycat: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ charlson   : num  0 1 1 2 0 3 0 2 0 1 ...
##  $ depcat     : num  3 3 3 3 3 3 3 3 3 3 ...
sapply(cost, summary)
## $outcosts2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     307    1016    1836    2463   11909 
## 
## $ptage
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.96   32.24   44.62   45.16   55.21   89.68       1 
## 
## $ssi
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.231   1.538   1.738   2.077   5.000       6 
## 
## $panic
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06291 0.00000 1.00000 
## 
## $anxiety
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06508 0.00000 1.00000 
## 
## $married
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.4096  1.0000  1.0000       2 
## 
## $female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.744   1.000   1.000 
## 
## $educat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   3.000   3.359   5.000   5.000       1 
## 
## $iadl
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   72.22   94.44   81.82  100.00  100.00 
## 
## $social
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00  100.00  100.00   89.86  100.00  100.00       2 
## 
## $racecat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.297   4.000   4.000 
## 
## $whiteleycat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.456   2.000   3.000 
## 
## $charlson
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5662  1.0000  7.0000 
## 
## $depcat
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    3.00    2.72    3.00    3.00


Normality assessment

cost_quant <- as.data.frame(cost %>% dplyr::select(quantVars))


normalityfun <- function(dataset) {
  require(moments)
  require(e1071)
  
  print(paste0("Summary Statistics: ", names(dataset)[i]))
  print(paste0("Mean: ", mean(as.numeric(dataset[,i]), na.rm = T)))
  print(paste0("Standard Deviation: ",sd(as.numeric(dataset[,i]), na.rm = T)))
  print(paste0("Median: ", median(as.numeric(dataset[,i]), na.rm = T)))
  print(paste0("Skewness: ", skewness(as.numeric(dataset[,i]), na.rm = T)))
  print(paste0("Kurtosis: ", e1071::kurtosis(as.numeric(dataset[,i]), type = 2)))
  print(paste0("Shapiro-Wilk Test: ", shapiro.test(as.numeric(dataset[,i]))$p.value))
  print(" ")
  print(" ")
  
  qqnorm(as.numeric(dataset[,i]), main = paste0("Normal Q-Q plot: ",names(dataset)[i]))
  qqline(as.numeric(dataset[,i]), col = 2)
  
  
}



for(i in 1:length(quantVars)) {
  
    print(ggplot() + 
            geom_histogram(data = cost_quant, fill = "blue", alpha = 0.5, mapping = 
                             aes(x = get(quantVars[i]))) +
                xlab(quantVars[i]) + labs(title = "Histogram")
    
  )
  
    normalityfun(cost_quant)
  
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: outcosts2"
## [1] "Mean: 1836.48373101952"
## [1] "Standard Deviation: 2195.4502208015"
## [1] "Median: 1016"
## [1] "Skewness: 1.86911992818483"
## [1] "Kurtosis: 3.71673113110437"
## [1] "Shapiro-Wilk Test: 1.32694421506979e-24"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: ptage"
## [1] "Mean: 45.1636818141237"
## [1] "Standard Deviation: 15.4858207820523"
## [1] "Median: 44.621492128679"
## [1] "Skewness: 0.422861613908007"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 9.58087935521775e-08"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: ssi"
## [1] "Mean: 1.73845513460898"
## [1] "Standard Deviation: 0.693705411185324"
## [1] "Median: 1.53846153846154"
## [1] "Skewness: 1.35710574430674"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 1.83131200837982e-19"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: iadl"
## [1] "Mean: 81.8185104844541"
## [1] "Standard Deviation: 25.5994105151121"
## [1] "Median: 94.4444444444444"
## [1] "Skewness: -1.54182530496445"
## [1] "Kurtosis: 1.54732926242677"
## [1] "Shapiro-Wilk Test: 3.82337675082766e-26"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: social"
## [1] "Mean: 89.8571774388768"
## [1] "Standard Deviation: 22.5952089828678"
## [1] "Median: 100"
## [1] "Skewness: -2.50691098205916"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 1.85402296574893e-33"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: charlson"
## [1] "Mean: 0.566160520607375"
## [1] "Standard Deviation: 1.07464059120437"
## [1] "Median: 0"
## [1] "Skewness: 2.57120849300461"
## [1] "Kurtosis: 8.03575657287451"
## [1] "Shapiro-Wilk Test: 1.66716672242903e-31"
## [1] " "
## [1] " "


The only variable that looks close to normal is patient age, which we may be able to use parametric testing by invoking the central limit theorem. Otherwise, we will report median and interquartile ranges.


Missingness

## assess for and remove > 10% missingness
## Set threshold
thr <- 0.1*length(cost$outcosts2)

sapply(cost, function(x) ifelse(sum(is.na(x)) > thr, 1, 0))
##   outcosts2       ptage         ssi       panic     anxiety     married 
##           0           0           0           0           0           0 
##      female      educat        iadl      social     racecat whiteleycat 
##           0           0           0           0           0           0 
##    charlson      depcat 
##           0           0
sapply(cost,function(x) sum(is.na(x)))
##   outcosts2       ptage         ssi       panic     anxiety     married 
##           0           1           6           0           0           2 
##      female      educat        iadl      social     racecat whiteleycat 
##           0           1           0           2           0           0 
##    charlson      depcat 
##           0           0
## Create complete dataset of phenotypic variables
cost <- na.omit(cost)

No variables were removed for missingness. However, ssi has the greatest missingness.

Table 1: Demographics

## Add labels to table

labelled::var_label(cost$outcosts2) <- "Outpatient costs ($)"
labelled::var_label(cost$ptage) <- "Age in years"
labelled::var_label(cost$ssi) <- "Somatic Symptom Index (No. (%))"
labelled::var_label(cost$panic) <- "Panic (No. (%))"
labelled::var_label(cost$anxiety) <- "Anxiety (No. (%))"
labelled::var_label(cost$married) <- "Married (No. (%))"
labelled::var_label(cost$female) <- "Female (No. (%))"
labelled::var_label(cost$educat) <- "Education Level (No. (%))"
labelled::var_label(cost$iadl) <- "IADL score"
labelled::var_label(cost$social) <- "Social Activities Score"
labelled::var_label(cost$racecat) <- "Race (No. (%))"
labelled::var_label(cost$whiteleycat) <- "Whiteley Category"
labelled::var_label(cost$charlson) <- "Charlson index"
labelled::var_label(cost$depcat) <- "Depression Category (No. (%))"




## Group variables

demovars <- names(cost)
demovars
##  [1] "outcosts2"   "ptage"       "ssi"         "panic"       "anxiety"    
##  [6] "married"     "female"      "educat"      "iadl"        "social"     
## [11] "racecat"     "whiteleycat" "charlson"    "depcat"
varsToFactor <- c(catVars, binVars) 

nonNormalVars <- names(cost_quant)[!names(cost_quant) %in% c("ptage")]

## Table one

tableOne <- CreateTableOne(vars = demovars, data = cost, factorVars = varsToFactor, testNonNormal = nonNormalVars)
tableOne
##                        
##                         Overall          
##   n                         449          
##   outcosts2 (mean (sd)) 1829.86 (2205.83)
##   ptage (mean (sd))       45.22 (15.55)  
##   ssi (mean (sd))          1.74 (0.69)   
##   panic = 1 (%)              28 ( 6.2)   
##   anxiety = 1 (%)            28 ( 6.2)   
##   married = 1 (%)           184 (41.0)   
##   female = 1 (%)            332 (73.9)   
##   educat (%)                             
##      1                       39 ( 8.7)   
##      2                      100 (22.3)   
##      3                       91 (20.3)   
##      4                       94 (20.9)   
##      5                      125 (27.8)   
##   iadl (mean (sd))        81.80 (25.72)  
##   social (mean (sd))      89.73 (22.78)  
##   racecat (%)                            
##      1                      174 (38.8)   
##      2                       97 (21.6)   
##      3                       48 (10.7)   
##      4                      130 (29.0)   
##   whiteleycat (%)                        
##      1                      297 (66.1)   
##      2                      100 (22.3)   
##      3                       52 (11.6)   
##   charlson (mean (sd))     0.57 (1.07)   
##   depcat (%)                             
##      1                       45 (10.0)   
##      2                       36 ( 8.0)   
##      3                      368 (82.0)
## Save as csv and word document
tableOne_out <- print(tableOne,quote = FALSE, noSpaces = TRUE, printToggle = FALSE, varLabel = TRUE, nonnormal = nonNormalVars)


write.csv(tableOne_out, file = paste0(analysisDir,"TableOne.csv"))
tableOne_out <- fread(paste0(analysisDir, "TableOne.csv"))

colnames(tableOne_out) <- c("","")

tab_df(tableOne_out, file = paste0(analysisDir, "TableOne.doc"))
n 449
Outpatient costs ($) (median [IQR]) 998.00 [305.00, 2463.00]
Age in years (mean (sd)) 45.22 (15.55)
Somatic Symptom Index (No. (%)) (median [IQR]) 1.54 [1.23, 2.08]
Panic (No. (%)) = 1 (%) 28 (6.2)
Anxiety (No. (%)) = 1 (%) 28 (6.2)
Married (No. (%)) = 1 (%) 184 (41.0)
Female (No. (%)) = 1 (%) 332 (73.9)
Education Level (No. (%)) (%)
1 39 (8.7)
2 100 (22.3)
3 91 (20.3)
4 94 (20.9)
5 125 (27.8)
IADL score (median [IQR]) 94.44 [72.22, 100.00]
Social Activities Score (median [IQR]) 100.00 [100.00, 100.00]
Race (No. (%)) (%)
1 174 (38.8)
2 97 (21.6)
3 48 (10.7)
4 130 (29.0)
Whiteley Category (%)
1 297 (66.1)
2 100 (22.3)
3 52 (11.6)
Charlson index (median [IQR]) 0.00 [0.00, 1.00]
Depression Category (No. (%)) (%)
1 45 (10.0)
2 36 (8.0)
3 368 (82.0)

Scatterplots and correlation coefficients

## Scatterplots and correlation coefficients

for (i in 1:length(explanVars.quant)) {
  
  for (j in 1:length(responseVars.quant)) {
 
     print(paste0("Explanatory var: ", explanVars.quant[i]))
     print(paste0("Response var: ", responseVars.quant[j]))
     
     cormod <- cor.test(as.data.frame(cost)[,explanVars.quant[i]],
                                    as.data.frame(cost)[,responseVars.quant[j]], 
                        method = "spearman")
     
     print(paste0("Spearman Correlation Coefficient: ", signif(cormod$estimate,5)))
     print(paste0("p-value: ", signif(cormod$p.value,5)))
     
     print(" ")
     print(" ")
  
     print(ggplot(data = cost) + 
             geom_point(mapping = aes(x = get(explanVars.quant[i]), 
                                      y = get(responseVars.quant[j]))) +
             geom_smooth(mapping = aes(x = get(explanVars.quant[i]), 
                                       y = get(responseVars.quant[j])), method = 'lm') +
             xlab(explanVars.quant[i]) + ylab(responseVars.quant[j]) +
               labs(subtitle =
                      paste0(paste0("r-value: ", signif(cormod$estimate,5)),", ",
                             paste0("p-value: ", signif(cormod$p.value,5))))) 
     
     ## plot residuals
     print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) + 
             geom_point(aes(x = .fitted, y = .resid)) + 
             labs(title = paste0("Residuals vs fitted: ",explanVars.quant[i]))) # fitted vs residuals
     
    print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) + 
            geom_histogram(aes(x = .resid)) + 
            labs(title = paste0("Residuals histogram: ",explanVars.quant[i])))       
     

    }
}
## [1] "Explanatory var: ptage"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties
## [1] "Spearman Correlation Coefficient: 0.12543"
## [1] "p-value: 0.0077943"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: ssi"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.28145"
## [1] "p-value: 1.2772e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: iadl"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.28202"
## [1] "p-value: 1.1791e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: social"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.16497"
## [1] "p-value: 0.00044822"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: charlson"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.32663"
## [1] "p-value: 1.2696e-12"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


All vars: Scatterplots and correlation coefficients

## Scatterplots and correlation coefficients

for (i in 1:length(explanVars)) {
  
  for (j in 1:length(responseVars.quant)) {
 
     print(paste0("Explanatory var: ", explanVars[i]))
     print(paste0("Response var: ", responseVars.quant[j]))
     
     cormod <- cor.test(as.data.frame(cost)[,explanVars[i]],
                                    as.data.frame(cost)[,responseVars.quant[j]], 
                        method = "spearman")
     
     print(paste0("Spearman Correlation Coefficient: ", signif(cormod$estimate,5)))
     print(paste0("p-value: ", signif(cormod$p.value,5)))
     
     print(" ")
     print(" ")
  
     print(ggplot(data = cost) + 
             geom_point(mapping = aes(x = get(explanVars[i]), 
                                      y = get(responseVars.quant[j]))) +
             geom_smooth(mapping = aes(x = get(explanVars[i]), 
                                       y = get(responseVars.quant[j])), method = 'lm') +
             xlab(explanVars[i]) + ylab(responseVars.quant[j]) +
               labs(subtitle =
                      paste0(paste0("r-value: ", signif(cormod$estimate,5)),", ",
                             paste0("p-value: ", signif(cormod$p.value,5))))) 
     
     ## plot residuals
     print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars[i]), data = cost)) + 
             geom_point(aes(x = .fitted, y = .resid)) + 
             labs(title = paste0("Residuals vs fitted: ",explanVars[i]))) # fitted vs residuals
     
    print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars[i]), data = cost)) + 
            geom_histogram(aes(x = .resid)) + 
            labs(title = paste0("Residuals histogram: ",explanVars[i])))       
     

    }
}
## [1] "Explanatory var: ptage"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties
## [1] "Spearman Correlation Coefficient: 0.12543"
## [1] "p-value: 0.0077943"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: ssi"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.28145"
## [1] "p-value: 1.2772e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: panic"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.10807"
## [1] "p-value: 0.021999"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: anxiety"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.10843"
## [1] "p-value: 0.021564"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: married"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.098379"
## [1] "p-value: 0.037172"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: female"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.13685"
## [1] "p-value: 0.003668"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: educat"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.23303"
## [1] "p-value: 5.9447e-07"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: iadl"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.28202"
## [1] "p-value: 1.1791e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: social"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.16497"
## [1] "p-value: 0.00044822"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: racecat"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.26825"
## [1] "p-value: 7.7228e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: whiteleycat"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.2111"
## [1] "p-value: 6.4357e-06"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: charlson"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.32663"
## [1] "p-value: 1.2696e-12"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: depcat"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars[i]],
## as.data.frame(cost)[, : Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.2717"
## [1] "p-value: 4.8674e-09"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Variable selection

# 
# FS <- regsubsets(as.formula(LinEq), data = cost, method = "forward", force.in = c("depcat","charlson"))
# 
# str(summary(FS))
# str(FS)
# 
# summary(FS)$rss

set.seed(123)

null <- lm(outcosts2~1, data = cost)
full <- lm(outcosts2~., data = cost)

FS <- step(null, scope=list(lower=null, upper=full), direction="forward")
## Start:  AIC=6914.57
## outcosts2 ~ 1
## 
##               Df Sum of Sq        RSS    AIC
## + charlson     1 240985083 1938832720 6864.0
## + iadl         1 163770786 2016047017 6881.5
## + ssi          1 133484030 2046333773 6888.2
## + racecat      1 124486863 2055330940 6890.2
## + depcat       1  96034963 2083782841 6896.3
## + educat       1  92820171 2086997633 6897.0
## + social       1  81518772 2098299031 6899.5
## + whiteleycat  1  80410942 2099406861 6899.7
## + panic        1  34614794 2145203009 6909.4
## + anxiety      1  24581357 2155236447 6911.5
## + ptage        1  20088860 2159728943 6912.4
## + female       1  13917217 2165900586 6913.7
## + married      1  13095811 2166721993 6913.9
## <none>                     2179817803 6914.6
## 
## Step:  AIC=6863.97
## outcosts2 ~ charlson
## 
##               Df Sum of Sq        RSS    AIC
## + ssi          1  85455902 1853376818 6845.7
## + iadl         1  82639449 1856193271 6846.4
## + racecat      1  76178757 1862653963 6848.0
## + depcat       1  62334119 1876498601 6851.3
## + whiteleycat  1  52109860 1886722860 6853.7
## + educat       1  49609668 1889223052 6854.3
## + social       1  45657600 1893175121 6855.3
## + anxiety      1  23073440 1915759280 6860.6
## + panic        1  20666597 1918166124 6861.2
## + female       1  19857339 1918975382 6861.3
## + married      1  18252581 1920580139 6861.7
## <none>                     1938832720 6864.0
## + ptage        1    185287 1938647434 6865.9
## 
## Step:  AIC=6845.73
## outcosts2 ~ charlson + ssi
## 
##               Df Sum of Sq        RSS    AIC
## + racecat      1  60285654 1793091165 6832.9
## + iadl         1  21990226 1831386592 6842.4
## + educat       1  20888173 1832488646 6842.6
## + depcat       1  20482683 1832894136 6842.7
## + female       1  13988501 1839388318 6844.3
## + married      1   9239284 1844137534 6845.5
## <none>                     1853376818 6845.7
## + whiteleycat  1   7309692 1846067127 6846.0
## + social       1   5917369 1847459450 6846.3
## + anxiety      1   4488975 1848887843 6846.6
## + panic        1   2688754 1850688064 6847.1
## + ptage        1   1212095 1852164723 6847.4
## 
## Step:  AIC=6832.88
## outcosts2 ~ charlson + ssi + racecat
## 
##               Df Sum of Sq        RSS    AIC
## + depcat       1  18609227 1774481937 6830.2
## + educat       1  14815977 1778275188 6831.2
## + iadl         1  14741752 1778349413 6831.2
## + female       1  13779113 1779312051 6831.4
## + married      1  12542302 1780548863 6831.7
## <none>                     1793091165 6832.9
## + whiteleycat  1   6267831 1786823334 6833.3
## + ptage        1   6075168 1787015997 6833.4
## + social       1   5680601 1787410564 6833.5
## + anxiety      1   4302180 1788788984 6833.8
## + panic        1   3881596 1789209569 6833.9
## 
## Step:  AIC=6830.2
## outcosts2 ~ charlson + ssi + racecat + depcat
## 
##               Df Sum of Sq        RSS    AIC
## + iadl         1  12767888 1761714049 6829.0
## + female       1  10797743 1763684194 6829.5
## + married      1  10520394 1763961543 6829.5
## + educat       1   9655606 1764826331 6829.7
## <none>                     1774481937 6830.2
## + ptage        1   3827942 1770653995 6831.2
## + social       1   2152371 1772329566 6831.7
## + whiteleycat  1   1062997 1773418940 6831.9
## + panic        1    746116 1773735821 6832.0
## + anxiety      1    375237 1774106700 6832.1
## 
## Step:  AIC=6828.96
## outcosts2 ~ charlson + ssi + racecat + depcat + iadl
## 
##               Df Sum of Sq        RSS    AIC
## + female       1   9855027 1751859022 6828.4
## + married      1   8715904 1752998145 6828.7
## <none>                     1761714049 6829.0
## + ptage        1   6217321 1755496729 6829.4
## + educat       1   6146735 1755567315 6829.4
## + social       1   1390391 1760323659 6830.6
## + whiteleycat  1    895128 1760818921 6830.7
## + panic        1    509845 1761204204 6830.8
## + anxiety      1    503253 1761210796 6830.8
## 
## Step:  AIC=6828.44
## outcosts2 ~ charlson + ssi + racecat + depcat + iadl + female
## 
##               Df Sum of Sq        RSS    AIC
## <none>                     1751859022 6828.4
## + educat       1   5304184 1746554838 6829.1
## + married      1   5231307 1746627714 6829.1
## + ptage        1   4679154 1747179868 6829.2
## + social       1   1411315 1750447707 6830.1
## + anxiety      1    788709 1751070313 6830.2
## + whiteleycat  1    552418 1751306604 6830.3
## + panic        1    410648 1751448374 6830.3
## View best model
summary(FS)
## 
## Call:
## lm(formula = outcosts2 ~ charlson + ssi + racecat + depcat + 
##     iadl + female, data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4806.9 -1158.9  -529.6   669.9  8624.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2917.841    838.943   3.478 0.000555 ***
## charlson     534.604     91.734   5.828 1.08e-08 ***
## ssi          285.302    175.069   1.630 0.103888    
## racecat     -279.386     77.138  -3.622 0.000326 ***
## depcat      -309.718    164.689  -1.881 0.060680 .  
## iadl          -7.997      4.630  -1.727 0.084814 .  
## female       341.034    216.276   1.577 0.115545    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1991 on 442 degrees of freedom
## Multiple R-squared:  0.1963, Adjusted R-squared:  0.1854 
## F-statistic:    18 on 6 and 442 DF,  p-value: < 2.2e-16
## Create vector of selected variables
selectedVars <- rownames(summary(FS)$coefficients)
selectedVars <- selectedVars[!selectedVars %in% c("(Intercept)")]
selectedVars
## [1] "charlson" "ssi"      "racecat"  "depcat"   "iadl"     "female"
summary(FS)$coefficients
##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 2917.840536 838.942636  3.477998 5.552137e-04
## charlson     534.604127  91.733723  5.827782 1.083905e-08
## ssi          285.302090 175.069404  1.629651 1.038876e-01
## racecat     -279.386184  77.137852 -3.621908 3.263541e-04
## depcat      -309.717995 164.688760 -1.880626 6.068003e-02
## iadl          -7.996864   4.629716 -1.727290 8.481415e-02
## female       341.034392 216.275689  1.576850 1.155454e-01

Multicollinearity: variance inflation factors

LinEq <- paste0("outcosts2~",paste0(selectedVars, collapse = "+"))
LinEq
## [1] "outcosts2~charlson+ssi+racecat+depcat+iadl+female"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
## 
## Call:
## lm(formula = as.formula(LinEq), data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4806.9 -1158.9  -529.6   669.9  8624.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2917.841    838.943   3.478 0.000555 ***
## charlson     534.604     91.734   5.828 1.08e-08 ***
## ssi          285.302    175.069   1.630 0.103888    
## racecat     -279.386     77.138  -3.622 0.000326 ***
## depcat      -309.718    164.689  -1.881 0.060680 .  
## iadl          -7.997      4.630  -1.727 0.084814 .  
## female       341.034    216.276   1.577 0.115545    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1991 on 442 degrees of freedom
## Multiple R-squared:  0.1963, Adjusted R-squared:  0.1854 
## F-statistic:    18 on 6 and 442 DF,  p-value: < 2.2e-16
vif(LinMod)
## charlson      ssi  racecat   depcat     iadl   female 
## 1.096209 1.670461 1.053903 1.236142 1.603067 1.020977
## Remove if VIF > 10 - none in this case

## Now create table for adjusted coefficients
coefTable <- 
  data.frame(
    variable = 
      rownames(summary(LinMod)$coefficients)[2:length(summary(LinMod)$coefficients[,1])], 
    coefficient = 
      as.numeric(summary(LinMod)$coefficients[2:length(summary(LinMod)$coefficients[,1]),1]), 
    LCI = 
      as.numeric(confint(LinMod)[2:length(summary(LinMod)$coefficients[,1]),1]), 
    UCI = as.numeric(confint(LinMod)[2:length(summary(LinMod)$coefficients[,1]),2]), 
    pvalue = 
               signif(as.numeric(summary(LinMod)$coefficients[2:length(summary(LinMod)$coefficients[,1]),4]),9))

kable(coefTable)
variable coefficient LCI UCI pvalue
charlson 534.604127 354.31566 714.892595 0.0000000
ssi 285.302090 -58.76979 629.373972 0.1038876
racecat -279.386184 -430.98872 -127.783646 0.0003264
depcat -309.717995 -633.38832 13.952333 0.0606800
iadl -7.996864 -17.09586 1.102129 0.0848141
female 341.034392 -84.02208 766.090863 0.1155454
## Now remove SSI which is likely an intermediary variable

LinEq.int <- paste0("outcosts2~",paste0(selectedVars[!selectedVars %in% c("ssi")], collapse = "+"))
LinEq.int
## [1] "outcosts2~charlson+racecat+depcat+iadl+female"
LinMod.int <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
## 
## Call:
## lm(formula = as.formula(LinEq), data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4806.9 -1158.9  -529.6   669.9  8624.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2917.841    838.943   3.478 0.000555 ***
## charlson     534.604     91.734   5.828 1.08e-08 ***
## ssi          285.302    175.069   1.630 0.103888    
## racecat     -279.386     77.138  -3.622 0.000326 ***
## depcat      -309.718    164.689  -1.881 0.060680 .  
## iadl          -7.997      4.630  -1.727 0.084814 .  
## female       341.034    216.276   1.577 0.115545    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1991 on 442 degrees of freedom
## Multiple R-squared:  0.1963, Adjusted R-squared:  0.1854 
## F-statistic:    18 on 6 and 442 DF,  p-value: < 2.2e-16
vif(LinMod.int)
## charlson      ssi  racecat   depcat     iadl   female 
## 1.096209 1.670461 1.053903 1.236142 1.603067 1.020977

Univariate OLS

coefTable.uni <- data.frame(variable = character(),coefficient = numeric(), LCI = numeric(), UCI = numeric(), pvalue = numeric())

for(i in 1:length(selectedVars)) {
  
  
  LinEq.uni <- paste0("outcosts2~",selectedVars[i])
  LinEq.uni

  LinMod.uni <- lm(as.formula(LinEq.uni), data = cost)

  coefTable.uni <- rbind(coefTable.uni, data.frame(variable = selectedVars[i], 
                            coefficient = as.numeric(summary(LinMod.uni)$coefficients[2,1]), 
                            LCI = confint(LinMod.uni)[2,1], 
                            UCI = confint(LinMod.uni)[2,2], 
                            pvalue = signif(summary(LinMod.uni)$coefficients[2,4], 9)
                       ))

}


kable(coefTable.uni)
variable coefficient LCI UCI pvalue
charlson 683.18615 503.05618 863.31612 0.0000000
ssi 786.08278 499.98553 1072.18004 0.0000001
racecat -421.10505 -580.15788 -262.05222 0.0000003
depcat -729.13175 -1044.84251 -413.42099 0.0000073
iadl -23.50489 -31.17078 -15.83901 0.0000000
female 401.08591 -64.02046 866.19228 0.0908156

Table 2: coefficients

table2 <- bind_cols(coefTable.uni,coefTable)
table2 <- as.data.frame(sapply(table2 %>% dplyr::select(-variable,-variable1), function(x) signif(x, 5)))
kable(table2)
coefficient LCI UCI pvalue coefficient1 LCI1 UCI1 pvalue1
683.190 503.060 863.320 0.0000000 534.6000 354.320 714.8900 0.0000000
786.080 499.990 1072.200 0.0000001 285.3000 -58.770 629.3700 0.1038900
-421.110 -580.160 -262.050 0.0000003 -279.3900 -430.990 -127.7800 0.0003264
-729.130 -1044.800 -413.420 0.0000073 -309.7200 -633.390 13.9520 0.0606800
-23.505 -31.171 -15.839 0.0000000 -7.9969 -17.096 1.1021 0.0848140
401.090 -64.020 866.190 0.0908160 341.0300 -84.022 766.0900 0.1155500
## if p < 0.0001
table2[,c("pvalue","pvalue1")] <- sapply(table2[,c("pvalue","pvalue1")], function(x) ifelse(x < 0.0001, "< 0.0001",x))


## Create vector rownames and extract labels
variableNames <- selectedVars

variableNameLabels <- vector(length = length(variableNames))

for(y in 1:length(variableNameLabels)) {
  
  variableNameLabels[y] <- labelled::var_label(cost[,variableNames[y]])
}


## Collapse 95% CIs into cell with hazard ratios

table2 <- table2 %>% mutate(variable = variableNameLabels, 
                       Coef1 = paste0(coefficient," (",LCI,",",UCI,")"), 
                       Coef2 = paste0(coefficient1," (",LCI1,",",UCI1,")")) %>% dplyr::select(variable,Coef1,pvalue,Coef2,pvalue1)
## Warning: package 'bindrcpp' was built under R version 3.4.4
colnames(table2) <- c("Variable", "Unadjusted Coefficient (95% CI)","p-value","Adjusted Coefficient (95% CI)","p-value")

kable(table2)
Variable Unadjusted Coefficient (95% CI) p-value Adjusted Coefficient (95% CI) p-value
Charlson index 683.19 (503.06,863.32) < 0.0001 534.6 (354.32,714.89) < 0.0001
Somatic Symptom Index (No. (%)) 786.08 (499.99,1072.2) < 0.0001 285.3 (-58.77,629.37) 0.10389
Race (No. (%)) -421.11 (-580.16,-262.05) < 0.0001 -279.39 (-430.99,-127.78) 0.00032635
Depression Category (No. (%)) -729.13 (-1044.8,-413.42) < 0.0001 -309.72 (-633.39,13.952) 0.06068
IADL score -23.505 (-31.171,-15.839) < 0.0001 -7.9969 (-17.096,1.1021) 0.084814
Female (No. (%)) 401.09 (-64.02,866.19) 0.090816 341.03 (-84.022,766.09) 0.11555
tab_df(table2, file = paste0(analysisDir,"Table2.doc"))
Variable Unadjusted Coefficient (95% CI) p-value Adjusted Coefficient (95% CI) p-value
Charlson index 683.19 (503.06,863.32) < 0.0001 534.6 (354.32,714.89) < 0.0001
Somatic Symptom Index (No. (%)) 786.08 (499.99,1072.2) < 0.0001 285.3 (-58.77,629.37) 0.10389
Race (No. (%)) -421.11 (-580.16,-262.05) < 0.0001 -279.39 (-430.99,-127.78) 0.00032635
Depression Category (No. (%)) -729.13 (-1044.8,-413.42) < 0.0001 -309.72 (-633.39,13.952) 0.06068
IADL score -23.505 (-31.171,-15.839) < 0.0001 -7.9969 (-17.096,1.1021) 0.084814
Female (No. (%)) 401.09 (-64.02,866.19) 0.090816 341.03 (-84.022,766.09) 0.11555

Effect modification

## Evaluate for effect modification by adding an interaction term
cost <- cost %>% mutate(chdep = charlson*depcat)
labelled::var_label(cost$chdep) <- 'Charlson*Depression Category Interaction Term' # label this new term

## Retrain the model with the new interaction term
LinEq <- paste0("outcosts2~",paste0(c(selectedVars[!selectedVars %in% c("ssi")],"chdep"), collapse = "+"))
LinEq
## [1] "outcosts2~charlson+racecat+depcat+iadl+female+chdep"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
## 
## Call:
## lm(formula = as.formula(LinEq), data = cost)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5292.0 -1155.1  -535.6   675.6  8607.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3757.230    590.935   6.358  5.1e-10 ***
## charlson     803.949    321.523   2.500 0.012765 *  
## racecat     -287.122     77.519  -3.704 0.000239 ***
## depcat      -310.057    183.192  -1.693 0.091251 .  
## iadl         -11.952      3.998  -2.989 0.002954 ** 
## female       341.680    216.736   1.576 0.115630    
## chdep       -106.097    120.613  -0.880 0.379528    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1995 on 442 degrees of freedom
## Multiple R-squared:  0.1929, Adjusted R-squared:  0.182 
## F-statistic: 17.61 on 6 and 442 DF,  p-value: < 2.2e-16
vif(LinMod)
##  charlson   racecat    depcat      iadl    female     chdep 
## 13.409631  1.059839  1.523035  1.190596  1.020984 13.300878

Figure 1: plot of outpatient costs vs. depression category grouped by depression category

## plot line of best fit for outpatient costs vs charlson grouped by depression category

ggplot(data = cost[,c("outcosts2",selectedVars)], 
       aes(x = charlson, y = outcosts2,group = as.factor(depcat), color = as.factor(depcat))) + 
  geom_smooth(method = 'lm') + 
  ggtitle(" ") + xlab("Charlson Index") + ylab("Outpatient Costs") + 
  labs(color = "Depression \nCategory\n") + 
  scale_color_manual(labels = c("Major","Minor","None"), values = c("red","blue","dark green"))

## Save file
pdf(paste0(analysisDir,"Figure1.pdf"))

ggplot(data = cost[,c("outcosts2",selectedVars)], 
       aes(x = charlson, y = outcosts2,group = as.factor(depcat), color = as.factor(depcat))) + 
  geom_smooth(method = 'lm') + 
  ggtitle(" ") + xlab("Charlson Index") + ylab("Outpatient Costs") + 
  labs(color = "Depression \nCategory\n") + 
  scale_color_manual(labels = c("Major","Minor","None"), values = c("red","blue","dark green"))
 

dev.off()
## quartz_off_screen 
##                 2

Session Info

sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2      leaps_3.0           car_3.0-0          
##  [4] carData_3.0-1       stargazer_5.2.1     sjPlot_2.4.1       
##  [7] tableone_0.9.3      ppcor_1.1           MASS_7.3-49        
## [10] lmSupport_2.9.13    lsmeans_2.27-62     e1071_1.6-8        
## [13] epiR_0.9-97         survival_2.41-3     moments_0.14       
## [16] readxl_1.0.0        knitr_1.20          data.table_1.10.4-3
## [19] forcats_0.3.0       stringr_1.3.0       dplyr_0.7.4        
## [22] purrr_0.2.4         readr_1.1.1         tidyr_0.8.0        
## [25] tibble_1.4.2        ggplot2_2.2.1       tidyverse_1.2.1    
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.2    blme_1.0-4         VGAM_1.0-6        
##   [4] plyr_1.8.4         lazyeval_0.2.1     sp_1.2-7          
##   [7] TMB_1.7.13         splines_3.4.3      unmarked_0.12-2   
##  [10] TH.data_1.0-8      digest_0.6.15      htmltools_0.3.6   
##  [13] gdata_2.18.0       magrittr_1.5       AICcmodavg_2.1-1  
##  [16] openxlsx_4.0.17    modelr_0.1.1       sandwich_2.4-0    
##  [19] colorspace_1.3-2   rvest_0.3.2        BiasedUrn_1.07    
##  [22] haven_1.1.1        crayon_1.3.4       jsonlite_1.5      
##  [25] lme4_1.1-17        bindr_0.1.1        zoo_1.8-1         
##  [28] glue_1.2.0         gtable_0.2.0       emmeans_1.1.3     
##  [31] sjstats_0.14.2-3   sjmisc_2.7.1       abind_1.4-5       
##  [34] scales_0.5.0       mvtnorm_1.0-7      ggeffects_0.3.2   
##  [37] Rcpp_0.12.16       xtable_1.8-2       merTools_0.3.0    
##  [40] foreign_0.8-69     stats4_3.4.3       prediction_0.2.0  
##  [43] survey_3.33-2      DT_0.4             htmlwidgets_1.0   
##  [46] httr_1.3.1         gplots_3.0.1       modeltools_0.2-21 
##  [49] pkgconfig_2.0.1    reshape_0.8.7      nnet_7.3-12       
##  [52] utf8_1.1.3         tidyselect_0.2.4   labeling_0.3      
##  [55] rlang_0.2.0        reshape2_1.4.3     later_0.7.4       
##  [58] munsell_0.4.3      cellranger_1.1.0   tools_3.4.3       
##  [61] cli_1.0.0          sjlabelled_1.0.8   broom_0.4.4       
##  [64] ggridges_0.5.0     evaluate_0.10.1    arm_1.9-3         
##  [67] yaml_2.1.18        caTools_1.17.1     coin_1.2-2        
##  [70] nlme_3.1-137       mime_0.5           xml2_1.2.0        
##  [73] compiler_3.4.3     pbkrtest_0.4-7     bayesplot_1.5.0   
##  [76] rstudioapi_0.7     curl_3.2           stringi_1.1.7     
##  [79] highr_0.6          lattice_0.20-35    Matrix_1.2-13     
##  [82] psych_1.8.3.3      nloptr_1.0.4       effects_4.0-1     
##  [85] stringdist_0.9.4.7 pillar_1.2.1       pwr_1.2-2         
##  [88] lmtest_0.9-36      estimability_1.3   bitops_1.0-6      
##  [91] raster_2.6-7       httpuv_1.4.5       R6_2.2.2          
##  [94] promises_1.0.1     KernSmooth_2.23-15 rio_0.5.10        
##  [97] codetools_0.2-15   gtools_3.5.0       assertthat_0.2.0  
## [100] rprojroot_1.3-2    mnormt_1.5-5       multcomp_1.4-8    
## [103] parallel_3.4.3     hms_0.4.2          grid_3.4.3        
## [106] labelled_1.0.1     coda_0.19-1        glmmTMB_0.2.0     
## [109] class_7.3-14       minqa_1.2.4        rmarkdown_1.9     
## [112] snakecase_0.9.1    gvlma_1.0.0.2      shiny_1.1.0       
## [115] lubridate_1.7.3